home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
PC World Komputer 2010 April
/
PCWorld0410.iso
/
pluginy Firefox
/
8614
/
8614.xpi
/
modules
/
application
/
extractors
/
DefaultTextExtractor.jsm
< prev
Wrap
Text File
|
2010-02-10
|
28KB
|
1,058 lines
// DO NOT import this into the global namespace, but instead
// import it into your own namespace wrapper
var EXPORTED_SYMBOLS = ["DefaultTextExtractor"];
Components.utils.import("resource://glydo/utils/prototype_xul_1_6_0_3_modified.jsm");
Components.utils.import("resource://glydo/utils/Utils.jsm");
Components.utils.import("resource://glydo/utils/Prefs.jsm");
// Namespace object exported through EXPORTED_SYMBOLS. Declared explicitly so
// the module does not rely on an implicit global assignment (which throws in
// strict mode).
var DefaultTextExtractor = {};
// Maps a lower-cased element name to the way the extractor treats it.
// Element names that are not listed here have no tag type; the parser simply
// descends into their children.
DefaultTextExtractor.TAG_TYPES = {
    // Links: text inside is counted as link words.
    "a": "anchor",

    // Elements that open a new nested block.
    "body": "div",
    "div": "div",
    "table": "div",

    // Headings: the digit in the element name is used as the header level.
    "h1": "header",
    "h2": "header",
    "h3": "header",
    "h4": "header",
    "h5": "header",
    "h6": "header",

    // Elements that open a paragraph and close it again when they end.
    "blockquote": "par",
    "caption": "par",
    "dir": "par",
    "dl": "par",
    "li": "par",
    "menu": "par",
    "ol": "par",
    "p": "par",
    "pre": "par",
    "td": "par",
    "th": "par",

    // Opens a paragraph without closing it afterwards.
    "dt": "par-start",

    // Elements that insert a line break into the current paragraph.
    "br": "line-break",
    "dd": "line-break",
    "hr": "line-break",

    // Elements whose whole subtree is ignored.
    "applet": "skip",
    "base": "skip",
    "code": "skip",
    "del": "skip",
    "embed": "skip",
    "frame": "skip",
    "head": "skip",
    "iframe": "skip",
    "img": "skip",
    "link": "skip",
    "map": "skip",
    "noframes": "skip",
    "noscript": "skip",
    "object": "skip",
    "samp": "skip",
    "script": "skip",
    "select": "skip",
    "style": "skip",
    "var": "skip"
};
// Style properties recorded for every text run. Each entry is a pair:
//   [0] key under which the value is stored on the Text object's style map
//   [1] CSS property name passed to the style lookup
DefaultTextExtractor.STYLE_PROPERTIES = [
    ["fontWeight", "font-weight"],
    ["fontSize", "font-size"]
];
/**
 * Base class for an ordered collection of extracted content objects.
 *
 * Word counts and the score are computed lazily and cached (null means "not
 * computed yet"). As soon as the total text/link word count or the score has
 * been computed the container counts as locked and add() refuses new items.
 */
DefaultTextExtractor.Container = Prototype.Class.create({
    /**
     * @param depth      nesting depth of this container
     * @param separator  string placed between the children by toText();
     *                   an omitted separator means the empty string
     */
    initialize: function(depth, separator) {
        this.objects = [];
        this.headerCandidates = [];
        this.depth = depth;
        this.score = null;
        this.nTextWordsCount = null;
        this.nLinkWordsCount = null;
        this.nLocalTextWordsCount = null;
        this.nLocalLinkWordsCount = null;
        this.separator = (separator === undefined) ? "" : separator;
    },

    /** True once a total word count or the score has been cached. */
    isLocked: function() {
        return !(this.nTextWordsCount === null &&
                 this.nLinkWordsCount === null &&
                 this.score === null);
    },

    /** Total number of words in all children (cached after first call). */
    getTextWordsCount: function() {
        if (this.nTextWordsCount !== null) {
            return this.nTextWordsCount;
        }
        var total = 0;
        for (var i = 0; i < this.objects.length; ++i) {
            total += this.objects[i].getTextWordsCount();
        }
        this.nTextWordsCount = total;
        return this.nTextWordsCount;
    },

    /** Words in direct children that are Paragraph instances (cached). */
    getLocalTextWordsCount: function() {
        if (this.nLocalTextWordsCount !== null) {
            return this.nLocalTextWordsCount;
        }
        var total = 0;
        for (var i = 0; i < this.objects.length; ++i) {
            var child = this.objects[i];
            if (child instanceof DefaultTextExtractor.Paragraph) {
                total += child.getTextWordsCount();
            }
        }
        this.nLocalTextWordsCount = total;
        return this.nLocalTextWordsCount;
    },

    /** Total number of link words in all children (cached). */
    getLinkWordsCount: function() {
        if (this.nLinkWordsCount !== null) {
            return this.nLinkWordsCount;
        }
        var total = 0;
        for (var i = 0; i < this.objects.length; ++i) {
            total += this.objects[i].getLinkWordsCount();
        }
        this.nLinkWordsCount = total;
        return this.nLinkWordsCount;
    },

    /** Link words in direct children that are Paragraph instances (cached). */
    getLocalLinkWordsCount: function() {
        if (this.nLocalLinkWordsCount !== null) {
            return this.nLocalLinkWordsCount;
        }
        var total = 0;
        for (var i = 0; i < this.objects.length; ++i) {
            var child = this.objects[i];
            if (child instanceof DefaultTextExtractor.Paragraph) {
                total += child.getLinkWordsCount();
            }
        }
        this.nLocalLinkWordsCount = total;
        return this.nLocalLinkWordsCount;
    },

    /** A container is empty when it holds no words at all. */
    isEmpty: function() {
        var wordCount = this.getTextWordsCount();
        return wordCount == 0;
    },

    /**
     * Appends a child object.
     * Throws (a string) when the container is already locked.
     */
    add: function(item) {
        if (this.isLocked()) {
            throw "Cannot modify container once scores have been calculated";
        }
        this.objects.push(item);
    },

    /** Remembers an object that might turn out to be the page header. */
    addHeaderCandidate: function(item) {
        this.headerCandidates.push(item);
    },

    /** The base container is pruned only when it is empty. */
    shouldPrune: function() {
        return this.isEmpty();
    },

    /**
     * Serialises the container into an element of the given document.
     *
     * @param doc     document used to create the nodes
     * @param params  optional; params.dontPrune disables the shouldPrune() test
     * @param name    element name, "container" when omitted
     * @return the new element, or null when the container is pruned or empty
     */
    toXml: function(doc, params, name) {
        var pruningEnabled = !params || !params.dontPrune;
        if (pruningEnabled && this.shouldPrune()) {
            return null;
        }
        // Empty containers are never emitted, even with dontPrune.
        if (this.isEmpty()) {
            return null;
        }
        var elementName = (name === undefined) ? "container" : name;
        var elem = doc.createElement(elementName);
        if (this.score != undefined) {
            elem.setAttribute("score", this.score);
        }
        this.objects.forEach(function(child) {
            var childXml = child.toXml(doc, params);
            if (childXml !== null) {
                elem.appendChild(childXml);
            }
        });
        return elem;
    },

    /** Text of all children joined with this container's separator. */
    toText: function() {
        var parts = this.objects.map(function(child) {
            return child.toText();
        });
        return parts.join(this.separator);
    },

    /** Delegates to Utils.containsAnyOf on the container's text. */
    containsAnyOf: function(searchStrings) {
        var ownText = this.toText();
        return Utils.containsAnyOf(ownText, searchStrings);
    },

    /**
     * Scores every child that supports scoring and sums the scores of the
     * children that are not to be pruned. Locks the container.
     */
    calcScore: function() {
        this.score = 0;
        for (var i = 0; i < this.objects.length; ++i) {
            var child = this.objects[i];
            if (child.calcScore === undefined) {
                continue;
            }
            child.calcScore();
            if (!child.shouldPrune()) {
                this.score += child.score;
            }
        }
    },

    /**
     * Depth-first search: collects every object accepted by the predicate.
     * Accepted objects are not searched any further; rejected objects are
     * searched when they offer a collect() method themselves.
     */
    collect: function(predicate, resultList) {
        var found = (resultList === undefined) ? [] : resultList;
        this.objects.forEach(function(child) {
            if (predicate(child)) {
                found.push(child);
            } else if (child.collect !== undefined) {
                child.collect(predicate, found);
            }
        });
        return found;
    },

    /** Gathers the header candidates of this container and nested containers. */
    collectHeaderCandidates: function(resultList) {
        var found = (resultList === undefined) ? [] : resultList;
        this.headerCandidates.forEach(function(candidate) {
            found.push(candidate);
        });
        this.objects.forEach(function(child) {
            if (child instanceof DefaultTextExtractor.Container) {
                child.collectHeaderCandidates(found);
            }
        });
        return found;
    },

    /** Like collect(), but looks at the direct children only. */
    collectLocal: function(predicate, resultList) {
        var found = (resultList === undefined) ? [] : resultList;
        this.objects.forEach(function(child) {
            if (predicate(child)) {
                found.push(child);
            }
        });
        return found;
    }
});
/**
 * A run of text with uniform style and link state inside a paragraph.
 */
DefaultTextExtractor.Text = Prototype.Class.create({
    /**
     * @param container         paragraph that owns this text; it is told when
     *                          the text qualifies as a header candidate
     * @param parentElement     element the style values are read from
     * @param text              initial text
     * @param isLink            whether the text is inside a link
     * @param remLeadingSpaces  whether addText() strips one leading blank
     */
    initialize: function(container, parentElement, text, isLink, remLeadingSpaces) {
        this.container = container;
        this.text = "";
        this.isLink = isLink;
        this.remLeadingSpaces = remLeadingSpaces;
        // Guards against registering the same object as header candidate
        // again every time more text is appended.
        this.headerCandidateRegistered = false;
        this.storeStyle(parentElement);
        this.addText(text);
    },

    setRemLeadingSpaces: function(remLeadingSpaces) {
        this.remLeadingSpaces = remLeadingSpaces;
    },

    /**
     * Records the STYLE_PROPERTIES values of the element in this.style.
     * Values for which Prototype.E.getStyle returns null are left out.
     */
    storeStyle: function(element) {
        var style = {};
        DefaultTextExtractor.STYLE_PROPERTIES.forEach(function(p) {
            var s = Prototype.E.getStyle(element, p[1]);
            if (s !== null) {
                style[p[0]] = s;
            }
        }, this);
        this.style = style;
    },

    /** Two texts can be merged when link state and recorded styles agree. */
    isCompatibleWith: function(otherText) {
        if (otherText.isLink !== this.isLink) {
            return false;
        }
        return this.stylesCompatible(this.style, otherText.style);
    },

    /**
     * Appends text and recomputes the word count and the whitespace-collapsed
     * text from the complete raw text (so an earlier rstrip() is undone).
     * A text that is bold and at least 16 pixels large is reported to its
     * container as a header candidate, once.
     */
    addText: function(text) {
        this.text += text;

        var words = this.text.split(/\s+/);
        var n = 0;
        for (var i = 0; i < words.length; ++i) {
            // split() yields empty strings for leading/trailing whitespace
            if (words[i].length != 0) {
                n++;
            }
        }
        this.nWords = n;

        this.strippedText = this.text.replace(/\s+/g, " ");
        if (this.remLeadingSpaces) {
            this.lstrip();
        }

        if (!this.headerCandidateRegistered) {
            var fontSize = Utils.getPixelsFromStyleSizeStr(this.style["fontSize"]);
            if (fontSize >= 16 && this.style["fontWeight"] == "bold") {
                this.container.addHeaderCandidate(this);
                this.headerCandidateRegistered = true;
            }
        }
    },

    /** Removes a single trailing blank from the collapsed text. */
    rstrip: function() {
        var l = this.strippedText.length;
        if ((l > 0) && (this.strippedText.charAt(l - 1) === ' ')) {
            this.strippedText = this.strippedText.substring(0, l - 1);
        }
    },

    /** Removes a single leading blank from the collapsed text. */
    lstrip: function() {
        var l = this.strippedText.length;
        if ((l > 0) && (this.strippedText.charAt(0) === ' ')) {
            this.strippedText = this.strippedText.substring(1);
        }
    },

    toText: function() {
        return this.strippedText;
    },

    // TODO: What about locking of text objects?
    getTextWordsCount: function() {
        return this.nWords;
    },

    /** All words count as link words when the text is inside a link. */
    getLinkWordsCount: function() {
        return this.isLink ? this.nWords : 0;
    },

    shouldPrune: function() {
        return false;
    },

    /**
     * Serialises the text.
     *
     * Returns a bare text node, unless the text is a link ("a" element) or
     * params.styleInfo is set ("span" element for non-links). With styleInfo
     * the element gets a "style" attribute holding the index of a matching
     * entry in params.styles; a new entry is appended when none matches.
     */
    toXml: function(doc, params) {
        var text = doc.createTextNode(this.toText());
        var elem = null;
        if (this.isLink) {
            elem = doc.createElement("a");
        } else if (params && params.styleInfo) {
            elem = doc.createElement("span");
        }
        if (!elem) {
            return text;
        }

        elem.appendChild(text);
        if (params && params.styleInfo) {
            params.styles = params.styles || [];
            var foundStyle = Prototype.A.find(params.styles, function(prevStyle) {
                return this.stylesCompatible(prevStyle.values, this.style);
            }, this);
            if (!foundStyle) {
                foundStyle = {
                    id: params.styles.length,
                    values: this.style
                };
                params.styles.push(foundStyle);
            }
            elem.setAttribute("style", foundStyle.id);
        }
        return elem;
    },

    /**
     * True when both style maps hold exactly the same values. Both maps are
     * walked so that a key present in only one of them is detected.
     */
    stylesCompatible: function(styleA, styleB) {
        var p;
        for (p in styleA) {
            if (styleA[p] !== styleB[p]) {
                return false;
            }
        }
        for (p in styleB) {
            if (styleA[p] !== styleB[p]) {
                return false;
            }
        }
        return true;
    }
});
/**
 * A paragraph: a container of Text and LineBreak objects that belongs to a
 * block. All mutators throw (a string) once the paragraph is locked.
 */
DefaultTextExtractor.Paragraph = Prototype.Class.create(DefaultTextExtractor.Container, {
    /**
     * @param containingBlock  block this paragraph belongs to
     * @param depth            nesting depth
     */
    initialize: function($super, containingBlock, depth) {
        $super(depth);
        this.containingBlock = containingBlock;
        this.nLinks = 0;
        this.inLink = false;
    },

    /**
     * Adds text found below parentElement. Text that is compatible with the
     * previous Text object (same link state and style) is merged into it;
     * otherwise a new Text object is appended.
     *
     * A call without text does nothing.
     */
    addText: function(parentElement, text) {
        if (this.isLocked()) {
            throw "Cannot modify paragraph once scores have been calculated";
        }
        if (text === undefined) {
            return;
        }

        var newText = new DefaultTextExtractor.Text(this, parentElement, text, this.inLink, false);
        var nObjects = this.objects.length;
        var prevSpace = true;
        var remLeadingSpaces = true;

        if (nObjects > 0) {
            var lastText = this.objects[nObjects - 1];
            if ((lastText instanceof DefaultTextExtractor.Text) &&
                lastText.isCompatibleWith(newText)) {
                lastText.addText(text);
                return;
            }

            var t = lastText.toText();
            if (t.length == 0 || !Prototype.S.blank(t.substring(t.length - 1))) {
                prevSpace = false;
                remLeadingSpaces = false;
            }

            // If the previous text ended with a blank and the
            // current text begins with a blank, remove one of them
            if (prevSpace && Prototype.S.blank(text.charAt(0))) {
                // If the last text is a link, remove the space from that
                if (lastText.isLink) {
                    lastText.rstrip();
                    remLeadingSpaces = false;
                }
            }
        }

        newText.setRemLeadingSpaces(remLeadingSpaces);
        // The Text object was built with remLeadingSpaces == false, so the
        // flag alone would only take effect on a later merge. Strip now so the
        // duplicate blank described above is really removed.
        if (remLeadingSpaces) {
            newText.lstrip();
        }
        this.add(newText);
    },

    /** Removes a trailing blank from the last object of the paragraph. */
    trimTrailingSpaces: function() {
        if (this.isLocked()) {
            throw "Cannot modify container once scores have been calculated";
        }
        var nObjects = this.objects.length;
        if (nObjects > 0) {
            this.objects[nObjects - 1].rstrip();
        }
    },

    /** Finishes the paragraph by trimming its trailing blank. */
    close: function() {
        if (this.isLocked()) {
            throw "Cannot modify container once scores have been calculated";
        }
        this.trimTrailingSpaces();
    },

    /** Text added from now on is link text. */
    openLink: function() {
        if (this.isLocked()) {
            throw "Cannot modify container once scores have been calculated";
        }
        this.nLinks++;
        this.inLink = true;
    },

    /** Text added from now on is plain text again. */
    closeLink: function() {
        if (this.isLocked()) {
            throw "Cannot modify container once scores have been calculated";
        }
        this.inLink = false;
    },

    /** Trims the text written so far and appends a LineBreak object. */
    addLineBreak: function() {
        if (this.isLocked()) {
            throw "Cannot modify container once scores have been calculated";
        }
        this.trimTrailingSpaces();
        this.add(new DefaultTextExtractor.LineBreak());
    },

    /** Serialises as a "p" element unless another name is given. */
    toXml: function($super, doc, params, name) {
        if (name === undefined) {
            name = "p";
        }
        return $super(doc, params, name);
    },

    /** Pruned when empty or when the score is -5 or lower. */
    shouldPrune: function($super) {
        return $super() || this.score <= -5;
    },

    /**
     * Scores the paragraph: a quarter point per non-link word, minus a
     * penalty proportional to the share of link words. The penalty is applied
     * a second time when the paragraph has more than 4 words of which more
     * than 2 are link words.
     */
    calcScore: function() {
        this.score = 0;
        var ntw = this.getTextWordsCount();
        var nlw = this.getLinkWordsCount();
        if (ntw > 0) {
            this.score -= 10 * nlw / ntw;
        }
        // Paragraph contains too many separate links (from daled)
        if (ntw > 4 && nlw > 2) {
            this.score -= 10 * nlw / ntw;
        }
        this.score += (ntw - nlw) / 4.0;
    }
});
/**
 * A forced line break inside a paragraph. It offers the same methods the
 * paragraph code calls on Text objects, but contributes no words.
 */
DefaultTextExtractor.LineBreak = Prototype.Class.create({
    /** A line break holds no words. */
    getTextWordsCount: function() {
        return 0;
    },

    /** A line break holds no link words. */
    getLinkWordsCount: function() {
        return 0;
    },

    /** Plain-text form: a newline character. */
    toText: function() {
        return "\n";
    },

    /** XML form: a "br" element; params is not used. */
    toXml: function(doc, params) {
        var breakElement = doc.createElement("br");
        return breakElement;
    },

    /** Nothing to strip; present so callers need not check the type. */
    rstrip: function() {
    },

    /** Nothing to strip; present so callers need not check the type. */
    lstrip: function() {
    }
});
/**
 * A heading paragraph that remembers its level.
 */
DefaultTextExtractor.Header = Prototype.Class.create(DefaultTextExtractor.Paragraph, {
    /**
     * @param containingBlock  block this header belongs to
     * @param depth            nesting depth
     * @param level            heading level
     */
    initialize: function($super, containingBlock, depth, level) {
        $super(containingBlock, depth);
        this.level = level;
    },

    /**
     * Level-1 headers get a fixed bonus of 5 plus half a point per word
     * (link words counted twice); other levels are scored like paragraphs.
     */
    calcScore: function($super) {
        if (this.level != 1) {
            $super();
            return;
        }
        var textWords = this.getTextWordsCount();
        var linkWords = this.getLinkWordsCount();
        this.score = 5 + (textWords + linkWords) / 2.0;
    },

    /**
     * Serialises as an "h" element (unless another name is given) carrying
     * the level in a "level" attribute. Returns null when the paragraph
     * serialisation returns null.
     */
    toXml: function($super, doc, params, name) {
        var elementName = (name === undefined) ? "h" : name;
        var headerElement = $super(doc, params, elementName);
        if (headerElement === null) {
            return headerElement;
        }
        headerElement.setAttribute("level", this.level);
        return headerElement;
    }
});
/**
 * A block-level region of the page: a container of paragraphs, headers and
 * nested blocks, with the position and size of the element it was built from.
 * Children are joined with a blank line by toText().
 */
DefaultTextExtractor.Block = Prototype.Class.create(DefaultTextExtractor.Container, {
    /**
     * @param depth           nesting depth; a block of depth 1 uses its own
     *                        dimensions as root dimensions
     * @param element         element the block represents (may be omitted)
     * @param position        optional position; when missing it is taken from
     *                        Prototype.E.cumulativeOffset(element)
     * @param scrollPosition  not used by this class
     * @param rootDimensions  dimensions the block width is compared against
     */
    initialize: function($super, depth, element, position, scrollPosition, rootDimensions) {
        $super(depth, '\n\n');
        this.maxScore = null;
        this.rootDimensions = rootDimensions;
        this.dimensions = null;
        this.position = null;
        this.visible = null;
        this.nodes = {};
        this.notInteresting = false;
        if (element) {
            this.dimensions = Prototype.E.getDimensions(element);
            this.position = position || Prototype.E.cumulativeOffset(element);
            // this is "1" because the first block holds the real width and height
            if (depth == 1) {
                this.rootDimensions = this.dimensions;
            }
        }
        this.punish = this.calcPunishment();
        // Heavily punished blocks are not parsed at all.
        if (this.punish > 3) {
            this.notInteresting = true;
        }
        this.openPar();
    },

    /**
     * Returns the block with the highest local score among this block and
     * all nested blocks, and stores that score in this.maxScore.
     */
    getBestBlock: function() {
        this.maxScore = 0;
        var bestBlock = null;
        var descBestBlock = null;
        for (var iObject = 0; iObject < this.objects.length; ++iObject) {
            var object = this.objects[iObject];
            if (!(object instanceof DefaultTextExtractor.Block)) {
                continue;
            }
            descBestBlock = object.getBestBlock();
            if (descBestBlock !== null && this.maxScore < descBestBlock.maxScore) {
                bestBlock = descBestBlock;
                this.maxScore = descBestBlock.maxScore;
            }
        }
        var score = this.calcLocalScore();
        if (bestBlock === null || score > this.maxScore) {
            bestBlock = this;
            this.maxScore = score;
        }
        return bestBlock;
    },

    /**
     * Score of the paragraphs directly inside this block:
     * (share of non-link words) * sqrt(non-link words) / punishment.
     */
    calcLocalScore: function() {
        var nAllWords = this.getLocalTextWordsCount();
        var nLinkWords = this.getLocalLinkWordsCount();
        var nOnlyTextWords = nAllWords - nLinkWords;
        var score = 0;
        if (nLinkWords + nOnlyTextWords != 0) {
            score = nOnlyTextWords / (nLinkWords + nOnlyTextWords) * Math.sqrt(nOnlyTextWords);
        }
        score = score / this.punish;
        return score;
    },

    /**
     * Computes the divisor applied to the local score. A block without a
     * known position or dimensions (no element was given) skips the
     * corresponding punishment instead of failing.
     *
     * @return a factor of at least 1
     */
    calcPunishment: function() {
        var punish = 1;
        var factor;
        // punish divs that don't start at initial top view
        if (this.position && this.position.top > 800) {
            factor = (this.position.top - 800) / 100 + 1;
            if (factor > 5) {
                factor = 5;
            }
            punish = punish * factor;
        }
        if (this.rootDimensions && this.dimensions) {
            // punish divs with width < 30%
            if (this.dimensions.width < this.rootDimensions.width * 0.3) {
                // factor - between 1 and 2.5
                factor = (this.rootDimensions.width * 0.3 - this.dimensions.width) / this.rootDimensions.width * 5 + 1;
                if (factor > 3) {
                    factor = 3;
                }
                punish = punish * factor;
            }
        }
        return punish;
    },

    /**
     * Creates a nested block for the element, appends it and registers it
     * under elementId so that getBlockByElement() can find it again.
     */
    openBlock: function(element, elementId, position, scrollPosition) {
        var b = new DefaultTextExtractor.Block(this.depth + 1, element, position, scrollPosition, this.rootDimensions);
        this.add(b);
        this.nodes[elementId] = b;
        return b;
    },

    /** Nested block registered under elementId, or undefined. */
    getBlockByElement: function(elementId) {
        return this.nodes[elementId];
    },

    // The following four methods forward to the current paragraph.
    addLineBreak: function() {
        this.curPar.addLineBreak();
    },

    addText: function(parentElement, text) {
        this.curPar.addText(parentElement, text);
    },

    openLink: function() {
        this.curPar.openLink();
    },

    closeLink: function() {
        this.curPar.closeLink();
    },

    /**
     * Direct child paragraphs that are not headers and contain none of the
     * disqualifying words.
     */
    getNormalParagraphs: function(paragraphDisqualifyingWords) {
        return this.collectLocal(function(o) {
            if ((o instanceof DefaultTextExtractor.Paragraph) &&
                !(o instanceof DefaultTextExtractor.Header)) {
                return !o.containsAnyOf(paragraphDisqualifyingWords);
            }
            return false;
        });
    },

    /**
     * Starts a header as the current paragraph. Headers of level 1 and 2 are
     * also recorded as header candidates.
     */
    openHeader: function(level) {
        this.curPar = new DefaultTextExtractor.Header(this, this.depth + 1, level);
        this.add(this.curPar);
        if (level <= 2) {
            this.addHeaderCandidate(this.curPar);
        }
    },

    /** Starts a new current paragraph without closing the previous one. */
    openPar: function() {
        this.curPar = new DefaultTextExtractor.Paragraph(this, this.depth + 1);
        this.add(this.curPar);
    },

    /** Closes the current paragraph and immediately starts a fresh one. */
    closePar: function() {
        this.curPar.close();
        this.curPar = new DefaultTextExtractor.Paragraph(this, this.depth + 1);
        this.add(this.curPar);
    },

    /** Pruned when empty or when the score is below 3. */
    shouldPrune: function($super) {
        return $super() || this.score < 3;
    },

    /**
     * Serialises as a "block" element (unless another name is given) with
     * size and position attributes.
     */
    toXml: function($super, doc, params, name) {
        if (name === undefined) {
            name = "block";
        }
        var elem = $super(doc, params, name);
        if (elem) {
            // NOTE(review): nothing in this class assigns this.display (the
            // constructor initialises this.visible instead), so d="0" is
            // written unless outside code sets it - confirm the intent.
            if (!this.display) {
                elem.setAttribute("d", "0");
            }
            if (this.dimensions) {
                elem.setAttribute("w", this.dimensions.width);
                elem.setAttribute("h", this.dimensions.height);
            }
            if (this.position) {
                elem.setAttribute("l", this.position.left);
                elem.setAttribute("t", this.position.top);
            }
            // viewportPosition / viewportDimensions are not set by this
            // class; the attributes appear only when outside code sets them.
            if (this.viewportPosition) {
                elem.setAttribute("vl", this.viewportPosition.left);
                elem.setAttribute("vt", this.viewportPosition.top);
            }
            if (this.viewportDimensions) {
                elem.setAttribute("viewWidth", this.viewportDimensions.width);
                elem.setAttribute("viewHeight", this.viewportDimensions.height);
            }
        }
        return elem;
    }
});
/**
 * Extracts the main text and the most likely headline of a document.
 *
 * The DOM is parsed in time-limited phases: when a phase runs out of time,
 * parsing stops and execute() reschedules itself. Progress is stored on the
 * DOM nodes as user data, so a later phase skips finished nodes and continues
 * inside started ones.
 */
DefaultTextExtractor.ExtractionTask = Prototype.Class.create({
    /**
     * @param doc        document to extract from
     * @param url        URL of the document (only stored)
     * @param options    optional overrides for the parameters set in initParams()
     * @param callbacks  optional map with notifyContextItemExtracted,
     *                   notifyTaskDone and notifyTaskFailed functions
     */
    initialize: function(doc, url, options, callbacks) {
        this.doc = doc;
        this.url = url;
        this.options = options || {};
        this.callNumber = 0;
        this.filtered = {};
        this.callbacks = callbacks || {};
        this.initParams();
    },

    /**
     * Parses one element and its subtree into the given block.
     *
     * @return true when the node is finished (or needs no parsing), false
     *         when the phase ran out of time and parsing must be resumed
     */
    parseDocumentNode: function(block, node, offsetParentPosition, parentScrollPosition) {
        // A missing node has nothing to parse and cannot carry user data.
        if (node === null || node === undefined) {
            return true;
        }
        if (this.isNodeStatusDone(node)) {
            return true;
        }
        if (block.notInteresting || node.nodeType != Components.interfaces.nsIDOMNode.ELEMENT_NODE) {
            this.markNodeStatusAsDone(node);
            return true;
        }

        var now = new Date();
        if (now.getTime() - this.phaseStart.getTime() > this.taskPhaseMax) {
            return false;
        }

        // "started" means an earlier phase already ran the opening action
        // for this node; it must not be repeated when resuming.
        var started = false;
        if (this.isNodeStatusStarted(node)) {
            started = true;
        } else {
            this.markNodeStatusAsStarted(node);
        }

        var position = null;
        var scrollPosition = null;
        var tag_type = DefaultTextExtractor.TAG_TYPES[node.nodeName.toLowerCase()];

        if (tag_type == "skip") {
            this.markNodeStatusAsDone(node);
            return true;
        }

        // Opening action.
        if (tag_type == "anchor") {
            if (!started) {
                block.openLink();
            }
        } else if (tag_type == "par-start" || tag_type == "par") {
            if (!started) {
                block.openPar();
            }
        } else if (tag_type == "line-break") {
            if (!started) {
                block.addLineBreak();
            }
        } else if (tag_type == "header") {
            if (!started) {
                // nodeName is e.g. "H2"; the second character is the level
                block.openHeader(node.nodeName[1]);
            }
        } else if (tag_type == "div") {
            var nodeId;
            var childBlock;
            if (!started) {
                nodeId = this.setNodeId(node);
                childBlock = block.openBlock(node, nodeId, position, scrollPosition);
            } else {
                nodeId = this.getNodeId(node);
                if (!nodeId) {
                    nodeId = this.setNodeId(node);
                }
                // Keep the parent in "block" until the child is known, so a
                // failed lookup can still open the child on the parent.
                childBlock = block.getBlockByElement(nodeId);
                if (!childBlock) {
                    childBlock = block.openBlock(node, nodeId, position, scrollPosition);
                }
            }
            block = childBlock;
        }

        if (!this.parseDocumentNodeChildren(block, node, offsetParentPosition, scrollPosition)) {
            return false;
        }

        // Closing action.
        if (tag_type == "anchor") {
            block.closeLink();
        } else if (tag_type == "par" || tag_type == "header") {
            block.closePar();
        }

        this.markNodeStatusAsDone(node);
        return true;
    },

    /**
     * Parses the children of a node: elements recursively, text and CDATA
     * nodes by adding their value to the block.
     *
     * @return false when the phase ran out of time, true otherwise
     */
    parseDocumentNodeChildren: function(block, node, basePositionForChildren, scrollPosition) {
        var children = node.childNodes;
        for (var ci = 0; ci < children.length; ++ci) {
            var child = children[ci];
            switch (child.nodeType) {
            case Components.interfaces.nsIDOMNode.ELEMENT_NODE:
                if (!this.parseDocumentNode(block, child, basePositionForChildren, scrollPosition)) {
                    return false;
                }
                break;
            case Components.interfaces.nsIDOMNode.TEXT_NODE:
            case Components.interfaces.nsIDOMNode.CDATA_SECTION_NODE:
                if (!this.isNodeStatusDone(child)) {
                    block.addText(node, child.nodeValue);
                    this.markNodeStatusAsDone(child);
                }
                break;
            }
        }
        return true;
    },

    // Parse status and block id are kept on the DOM node as user data.
    markNodeStatusAsDone: function(node) {
        node.setUserData("glydoParsedNodeStatus", "DONE", null);
    },

    markNodeStatusAsStarted: function(node) {
        node.setUserData("glydoParsedNodeStatus", "STARTED", null);
    },

    isNodeStatusDone: function(node) {
        return node.getUserData("glydoParsedNodeStatus") == "DONE";
    },

    isNodeStatusStarted: function(node) {
        return node.getUserData("glydoParsedNodeStatus") == "STARTED";
    },

    /** Assigns a fresh id from Utils.uuid1() to the node and returns it. */
    setNodeId: function(node) {
        var id = Utils.uuid1();
        node.setUserData("glydoUniqueId", id, null);
        return id;
    },

    getNodeId: function(node) {
        return node.getUserData("glydoUniqueId");
    },

    /**
     * Heuristic score for a header candidate (a Header or a Text object).
     *
     * @param bestBlock  block chosen as the main text
     * @param candidate  Header or Text object
     * @param pageTitle  document title, or null
     * @return 0 for blank candidates and candidates in uninteresting blocks,
     *         otherwise a positive score (higher means more likely)
     */
    calcHeaderProbability: function(bestBlock, candidate, pageTitle) {
        if (Prototype.S.strip(candidate.toText()).length == 0) {
            return 0;
        }
        var score = 1;
        var containingBlock = null;
        // give higher score to h1 or h2
        if (candidate instanceof DefaultTextExtractor.Header) {
            score *= 4 - candidate.level;
            containingBlock = candidate.containingBlock;
        } else {
            // give higher score to larger fonts
            var fontSize = Utils.getPixelsFromStyleSizeStr(candidate.style["fontSize"]);
            score *= fontSize / 16.0;
            containingBlock = candidate.container.containingBlock;
        }
        if (containingBlock.notInteresting) {
            return 0;
        }
        var text = Prototype.S.strip(candidate.toText());
        // if the candidate is a part of the page title it gives it a serious boost...
        if (pageTitle !== null && (pageTitle.indexOf(text) != -1) && text.length > 10) {
            score *= 3;
        }
        if (containingBlock == bestBlock) {
            score *= 2;
        } else {
            // not too close to the top
            if (containingBlock.position.top < 50) {
                score /= 2;
            }
            // not too far down from the article (even though a little down is ok, cause there can be another
            // block in between)
            if (containingBlock.position.top - bestBlock.position.top > 100) {
                score /= 3;
            }
            // not too to the side from the article
            var horizDiff = containingBlock.position.left - bestBlock.position.left;
            if (horizDiff > 50 || horizDiff < -200) {
                score /= 3;
            }
        }
        return score;
    },

    /**
     * Picks the header candidate with the highest probability below block.
     *
     * @return {header, headerCertainty}; header is null and the certainty 0
     *         when no candidate scores above 0
     */
    extractHeader: function(block, bestBlock, pageTitle) {
        var headerList = block.collectHeaderCandidates();
        var maxHeaderScore = 0;
        var header = null;
        if (headerList !== null) {
            for (var hi = 0; hi < headerList.length; ++hi) {
                var candidate = headerList[hi];
                if (!(candidate instanceof DefaultTextExtractor.Header) &&
                    !(candidate instanceof DefaultTextExtractor.Text)) {
                    continue;
                }
                var headerScore = this.calcHeaderProbability(bestBlock, candidate, pageTitle);
                if (headerScore > maxHeaderScore) {
                    maxHeaderScore = headerScore;
                    header = candidate;
                }
            }
        }
        // TODO: If headline and page title match, indicate a high certainty.
        // The original check for this had an empty body and was removed.
        return ({header: header, headerCertainty: maxHeaderScore});
    },

    /** Non-header paragraphs of the block without disqualifying words. */
    extractNormalParagraphs: function(block) {
        // Get list of non-header texts
        var paraList = block.getNormalParagraphs(this.paragraphDisqualifyingWords);
        return paraList;
    },

    /** Appends the XML form of the block's normal paragraphs to parent. */
    extractNormalParagraphsAsXml: function(block, parent) {
        var paraList = this.extractNormalParagraphs(block);
        for (var ti = 0; ti < paraList.length; ++ti) {
            var x = paraList[ti].toXml(parent.ownerDocument);
            if (x !== null) {
                parent.appendChild(x);
            }
        }
    },

    /**
     * Builds the result item: the title (with a certainty value) and the body
     * (a document fragment holding the paragraphs of the best block, with the
     * block's score).
     */
    getContextItem: function(doc, rootBlock, bestBlock, pageTitle) {
        var item = {
            "@type": "text",
            title: {
                "__content": ""
            },
            body: {
                "__content": ""
            }
        };
        var ht = this.extractHeader(rootBlock, bestBlock, pageTitle);
        item.title["@certainty"] = ht.headerCertainty;
        if (ht.header !== null) {
            item.title["__content"] = ht.header.toText();
        }
        var textDoc = doc.implementation.createDocument("", "", null);
        var body = textDoc.createDocumentFragment();
        textDoc.appendChild(body);
        this.extractNormalParagraphsAsXml(bestBlock, body);
        item.body["@score"] = bestBlock.maxScore;
        item.body["__content"] = body;
        return item;
    },

    /**
     * Sets the task parameters from this.options, falling back to the
     * built-in word list and the time limits read from Prefs.
     */
    initParams: function() {
        var defaults = ({
            paragraphDisqualifyingWords:
            [
                "\u00A9",
                "registered trademark",
                "all rights reserved",
                ["inappropriate", "comments"],
                ["profanity", "comments"],
                ["terms", "conditions"]
            ],
            taskPhaseMax: Prefs.task_phase_max,
            totalTaskMax: Prefs.total_task_max,
            taskBreak: Prefs.task_break
        });
        this.setParameter("paragraphDisqualifyingWords", this.options, defaults);
        this.setParameter("taskPhaseMax", this.options, defaults);
        this.setParameter("totalTaskMax", this.options, defaults);
        this.setParameter("taskBreak", this.options, defaults);
    },

    /** Stores params[name] on this, or defaults[name] when it is undefined. */
    setParameter: function(name, params, defaults) {
        var p = params[name];
        if (p === undefined) {
            p = defaults[name];
        }
        this[name] = p;
    },

    /**
     * Runs one parsing phase.
     *
     * When the phase times out, the task either reports failure (total time
     * exceeded totalTaskMax) or schedules another phase after taskBreak.
     * When parsing completes, the context item is built and the
     * notifyContextItemExtracted and notifyTaskDone callbacks are called.
     */
    execute: function() {
        this.callNumber++;
        if (!this.rootBlock) {
            this.taskStart = new Date();
            this.rootBlock = new DefaultTextExtractor.Block(0, this.doc.documentElement);
        }
        this.phaseStart = new Date();
        if (!this.parseDocumentNode(this.rootBlock, this.doc.documentElement)) {
            var after = new Date();
            if (after.getTime() - this.taskStart.getTime() > this.totalTaskMax) {
                if (this.callbacks["notifyTaskFailed"]) {
                    this.callbacks["notifyTaskFailed"]("Text extraction took too much");
                }
            } else {
                this.doc.defaultView.setTimeout(Prototype.F.bind(this.execute, this), this.taskBreak);
            }
        } else {
            var bestBlock = this.rootBlock.getBestBlock();
            var item = this.getContextItem(this.doc, this.rootBlock, bestBlock, this.doc.title);
            if (this.callbacks["notifyContextItemExtracted"]) {
                this.callbacks["notifyContextItemExtracted"](item);
            }
            if (this.callbacks["notifyTaskDone"]) {
                this.callbacks["notifyTaskDone"]();
            }
        }
    }
});